package com.chance.cc.crawler.prod.command.job.domain.news.ifeng.realtime;

import com.alibaba.fastjson.JSON;
import com.chance.cc.crawler.core.CrawlerEnum;
import com.chance.cc.crawler.core.CrawlerJob;
import com.chance.cc.crawler.core.downloader.HttpConfig;
import com.chance.cc.crawler.core.downloader.HttpPage;
import com.chance.cc.crawler.core.filter.FilterUtils;
import com.chance.cc.crawler.core.record.CrawlerRecord;
import com.chance.cc.crawler.core.record.CrawlerRequestRecord;
import com.chance.cc.crawler.meta.core.bean.CrawlerMetaConstant;
import com.chance.cc.crawler.meta.core.bean.job.CrawlerScheduleJob;
import com.chance.cc.crawler.prod.command.job.domain.news.NewsCommonScript;
import org.apache.commons.lang3.StringUtils;

import static com.chance.cc.crawler.core.CrawlerEnum.CrawlerRequestType.filter;
import static com.chance.cc.crawler.development.scripts.allfeild.AICCommonField.Tag_Site_Info;

/**
 * Scheduler job definition for ifeng.com (Phoenix New Media): real-time crawling of
 * car-series articles.
 *
 * @author ding
 * @version 1.0
 * @since 2021/9/3 11:24
 */
public class IFengArticleRealtimeCrawlerSchedulerJob extends NewsCommonScript {
    public static final String domain = "ifeng";
    private static final String crawler_level = "realtime";
    private static final String siteBiz = "geely";
    private static final String site = "SeriesArticle";

    public static void main(String[] args) {
       publishCrawlerScheduleJobInfo();

    }
    /**
     * 发布任务信息
     * @return
     */
    public static CrawlerJob publishCrawlerScheduleJobInfo(){

        CrawlerJob crawlerJob = crawlerScheduler();

        //发布定时任务
        CrawlerScheduleJob crawlerScheduleJob = new CrawlerScheduleJob();
        crawlerScheduleJob.setDomain(domain);
        crawlerScheduleJob.setCrawlerJob(JSON.toJSONString(crawlerJob));
        crawlerScheduleJob.setJobType(CrawlerMetaConstant.ScheduleCrawlerJobType.crawler.enumVal());
        crawlerScheduleJob.setNote("凤凰新闻汽车板块 车系文章每天实时采集任务");
        crawlerScheduleJob.setCrawlerKey(crawlerJob.generateCrawlerKey());
        HttpPage httpPage = metaServiceCommand.addOrUpdateCrawlerScheduleJob(crawlerScheduleJob);
        System.out.println("发布任务：" + httpPage.getRawText());
        return crawlerJob;
    }

    public static CrawlerJob crawlerScheduler(){

        //文章采集
        CrawlerRequestRecord articleCrawler = doSearchNewsCrawler();

        CrawlerRequestRecord keyRecord = CrawlerRequestRecord.builder()
                .startPageRequest(domain, CrawlerEnum.CrawlerRequestType.turnPageItem)
                .httpUrl("http://192.168.1.215:9599/v1/meta/ifeng/keys?site=SeriesArticle")  //todo 关键字传入
                .requestLabelTag(CrawlerEnum.CrawlerRequestType.supportSource)
                .requestLabelTag(CrawlerEnum.CrawlerRequestType.internalDownload)
                .build();


        CrawlerJob crawlerJob = CrawlerJob.builder()
                .triggerInfo(domain,
                        CrawlerMetaConstant.ScheduleJobTrigger_Cron,
                        System.currentTimeMillis(),//ifeng-SeriesArticle-geely-realtime-realtime
                        StringUtils.joinWith("-",  site,siteBiz, crawler_level, CrawlerMetaConstant.ScheduleJobTriggerJob_Realtime))
                .crawlerRequestQueue(CrawlerMetaConstant.redisRequestQueue(StringUtils.joinWith("-", "crawler", domain, site, crawler_level, "queue")))
                .fileResultPipeline("kafka","/data/chance_crawler_runner/logs/node/ifeng.log",false)
                .kafkaResultPipeline("kafka", kafkaTopic, null)
                .crawlerJobThreadNumber(10)
                .requestRecord(articleCrawler)
                .supportRecord(keyRecord)
                .build();
        //添加评论去重信息
        CrawlerRecord filterCrawlerRecord = new CrawlerRecord();
        filterCrawlerRecord.setFilter(CrawlerEnum.CrawlerRecordFilter.keyOrDateRange);
        filterCrawlerRecord.addFilterInfo(FilterUtils.redisFilterKeyInfo(StringUtils.joinWith("-",filter,domain,site,"comment")));
        filterCrawlerRecord.addFilterInfo(FilterUtils.dateRangeFilterInfo(24*1,null));
        articleCrawler.tagsCreator().bizTags().addCustomKV("comment_record_filter_info", JSON.toJSONString(filterCrawlerRecord));
        crawlerJob.getScheduleTags().getCategoryTag().addLabelTag(CrawlerEnum.CrawlerDataType.comment.enumVal());
        return crawlerJob;
    }

    /**
     * 初始record
     * @return
     */
    public static CrawlerRequestRecord doSearchNewsCrawler(){

        CrawlerRequestRecord requestRecord = CrawlerRequestRecord.builder()
                .startPageRequest(domain, CrawlerEnum.CrawlerRequestType.turnPage)
                .domain(domain)
                .recordKey("https://ncar.auto.ifeng.com/model/modelArticlePage?serialId=11525")
                .httpUrl("https://ncar.auto.ifeng.com/model/modelArticlePage?serialId=11525")
                .releaseTime(System.currentTimeMillis())
                .httpConfig(HttpConfig.me(domain))
                .filter(CrawlerEnum.CrawlerRecordFilter.keyOrDateRange)
                //.filter(CrawlerEnum.CrawlerRecordFilter.dateRange)
                .addFilterInfo(FilterUtils.redisFilterKeyInfo(domain))
                .addFilterInfo(FilterUtils.dateRangeFilterInfo(24*7, null))
                .resultLabelTag(CrawlerEnum.CrawlerDataType.article)
                .resultLabelTag(CrawlerEnum.CrawlerDataType.interaction)
                .resultLabelTag(CrawlerEnum.CrawlerDataType.comment)
                .proxy(proxy)
                .needParsed(true)
                .needWashed(false)
                .build();

        requestRecord.tagsCreator().bizTags().addDomain(domain);
        requestRecord.tagsCreator().bizTags().addSite(site);
        requestRecord.tagsCreator().bizTags().addSiteBiz(siteBiz);
        requestRecord.tagsCreator().bizTags().addCustomKV(Tag_Site_Info,site);
        return requestRecord;
    }
}

